I can use PyMC3 to estimate the posterior PDF for the true skill level of every BUDA team using its W-L record during the season. This could be superior to my initial approach of assuming a discrete skill level based on Division and then modifying that skill level based on individual game outcomes. It will also serve as a great playground in which to explore PyMC3.
Before I jump in with actual BUDA data, I want to test things out with a small sample of artificial game data.
In [1]:
import pandas as pd
import os
import numpy as np
import pymc3 as pm
from pymc3.math import invlogit
import theano.tensor as tt
%matplotlib inline
In [3]:
n_teams = 67
teams = range(n_teams)
In [70]:
true_skill = {}
for team in teams:
true_skill[team] = np.random.normal(loc=1600, scale=200)
In [71]:
true_skill
Out[71]:
In [72]:
n_games = 520
games = range(n_games)
database = []
for game in games:
game_database = {}
matchup = np.random.choice(teams, size=2, replace=False)
game_database['Team A'] = matchup[0]
game_database['Team B'] = matchup[1]
true_skills = [true_skill[matchup[0]], true_skill[matchup[1]]]
game_skills = np.random.normal(loc=true_skills, scale=100)
outcome_A = game_skills[0] > game_skills[1]
game_database['Winner is A'] = outcome_A
database.append(game_database)
In [73]:
game_results = pd.DataFrame(database)
In [74]:
game_results.head(10)
Out[74]:
In [75]:
project_dir = '/Users/rbussman/Projects/BUDA/buda-ratings'
scores_dir = os.path.join(project_dir, 'data', 'raw', 'game_scores')
game_results.to_csv(os.path.join(scores_dir, 'artificial_scores_big.csv'))
Prior on each team is a normal distribution with mean of 0 and standard deviation of 1.
In [76]:
game_results.shape
Out[76]:
In [77]:
with pm.Model() as model:
skill = pm.Normal('skill', mu=0, sd=1, shape=n_teams)
B_minus_A = skill[game_results['Team B'].values] - skill[game_results['Team A'].values]
# Avoid 0 or 1
lower = 1e-6
upper = 1 - 1e-6
probability_A_beats_B = lower + (upper - lower) * 1 / (1 + tt.exp(B_minus_A))
observation = pm.Bernoulli('observation', probability_A_beats_B, observed=game_results['Winner is A'].values)
In [78]:
with model:
trace = pm.sample(1000)
In [64]:
pm.traceplot(trace)
Out[64]:
In [65]:
trace.varnames
Out[65]:
In [79]:
trace['skill'].shape
Out[79]:
In [80]:
estimatedskills = trace['skill'].mean(axis=0)
In [81]:
for key in true_skill:
print("True: {:.2f}; Estimated: {:.2f}".format((true_skill[key] - 1600) / 100, estimatedskills[key]))
In [22]:
key
Out[22]:
In [ ]: